#load all the required libraries
import re # for regular expressions
import nltk # for text manipulation
import string
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
# for advanced visualizations
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
init_notebook_mode(connected = True)
import plotly.figure_factory as ff
from sklearn.decomposition import KernelPCA
from sklearn.model_selection import StratifiedKFold, cross_val_score
pd.set_option("display.max_colwidth", 200)
warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline
# Upload the dataset interactively (Google Colab only).
from google.colab import files
uploaded =files.upload()
# Read the tab-separated Amazon Alexa reviews dataset.
data = pd.read_csv('amazon_alexa.tsv',sep='\t')
data.head()
data.describe()
# True if any value anywhere in the frame is null.
data.isnull().any().any()
# Add a 'length' column: character count of each review.
data['length'] =data['verified_reviews'].apply(len)
# Summary statistics grouped by star rating.
data.groupby('rating').describe()
# Summary statistics grouped by feedback label.
data.groupby('feedback').describe()
def configure_plotly_browser_state():
    """Inject the require.js configuration that lets Plotly render inside
    a Google Colab output cell; call before each iplot() in Colab."""
    import IPython
    # NOTE(review): `display` is the IPython/Colab notebook built-in;
    # this function assumes it runs inside a notebook environment.
    display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-1.5.1.min.js?noext',
            },
          });
        </script>
        '''))
# Pie chart: how the star ratings are distributed across all reviews.
configure_plotly_browser_state()
init_notebook_mode(connected=False)

rating_counts = data['rating'].value_counts()
slice_colors = ['pink', 'lightblue', 'aqua', 'gold', 'crimson']

pie_trace = go.Pie(
    labels=rating_counts.index,
    values=rating_counts.values,
    marker=dict(colors=slice_colors),
    name='Alexa',
    hole=0.3,
)
chart_data = [pie_trace]
chart_layout = go.Layout(title='Distribution of Ratings for Alexa')
fig = go.Figure(data=chart_data, layout=chart_layout)
py.iplot({"data": chart_data, "layout": chart_layout})
By looking at the above pie chart, we can infer that most of the ratings for Alexa are good. Around 72.6% of people have given Alexa a 5-star rating, which is very good, and 14.4% have given a 4-star rating, which is also good. That means 72.6 + 14.4 = 87% of people have given Alexa a good rating. 4.38% of people have given Alexa an average rating of 3 stars. 3.05% of people did not like Alexa and gave only a 2-star rating, whereas 5.11% hated Alexa and gave only a 1-star rating. Thus a total of 3.05 + 5.11 = 8.16% of people did not like Alexa.
# Bar chart of how many reviews each Alexa variation received.
variation_counts = data['variation'].value_counts()
variation_counts
variation_counts.plot.bar(figsize=(15, 6))
plt.title("Distribution of Alexa's variation")
plt.xlabel("variations")
plt.ylabel("count")
plt.show()
The above bar plot shows the popularity of each Alexa variation. It clearly shows that Black Dot is the most popular variation of all, followed by Charcoal Fabric. Some of the unpopular variations of Alexa are Oak Finish and Walnut Finish, whose counts in the dataset are very low, signifying rare use of these variations.
# Pie chart: distribution of the binary feedback label across all reviews.
configure_plotly_browser_state()
init_notebook_mode(connected=False)
feedback = data['feedback'].value_counts()
label_rating = feedback.index
size_rating = feedback.values
colors = ['red', 'blue']
rating_piechart = go.Pie(labels = label_rating,
                         values = size_rating,
                         marker = dict(colors = colors),
                         name = 'Alexa', hole = 0.3)
df = [rating_piechart]
# Bug fix: the title was copy-pasted from the ratings chart above;
# this chart shows the feedback label, not the star ratings.
layout = go.Layout(
           title = 'Distribution of Feedback for Alexa')
fig = go.Figure(data = df,
                layout = layout)
py.iplot({"data":df,"layout":layout})
It can be observed that a total of 91.8% of the population love Alexa, having given positive feedback, while only 8.16% are not satisfied with Alexa — very few compared to the positive ratings.
# Bucket review lengths into 50-character bins and plot how many
# reviews fall into each bucket.
length_bins = list(range(1, 1500, 50))
data['length_range'] = pd.cut(data['length'], length_bins)
data['length_range'].value_counts().plot.bar(figsize=(15, 6))
plt.title("review length distributions")
# Boxen plot: rating spread for each product variation.
plt.rcParams['figure.figsize'] = (15, 9)
sns.boxenplot(x='variation', y='rating', data=data, palette="spring")
plt.title("Variation vs Rating")
plt.xticks(rotation=90)
plt.show()

# Violin plot: rating distribution per feedback label.
plt.rcParams['figure.figsize'] = (12, 7)
plt.style.use('fivethirtyeight')
# Bug fix: pass x/y as keywords — positional data arguments to seaborn
# plotting functions were deprecated in 0.12 and removed in 0.13, so the
# original positional call breaks on current seaborn versions.
sns.violinplot(x=data['feedback'], y=data['rating'], palette='cool')
plt.title("feedback wise Mean Ratings")
plt.show()
# Word counts: term frequencies over all reviews (English stop words removed).
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
doc_term_matrix = cv.fit_transform(data.verified_reviews)
total_counts = doc_term_matrix.sum(axis=0)

# (word, count) pairs sorted by descending frequency; reused later by
# the word-cloud cell.
words_freq = sorted(
    ((term, total_counts[0, col]) for term, col in cv.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

frequency = pd.DataFrame(words_freq, columns=['word', 'freq'])
plt.style.use('fivethirtyeight')
bar_colors = plt.cm.ocean(np.linspace(0, 1, 20))
frequency.head(20).plot(x='word', y='freq', kind='bar', figsize=(15, 6), color=bar_colors)
plt.title("Most Frequently Occuring Words - Top 20")
plt.show()
from wordcloud import WordCloud

# Render the global term frequencies as a word cloud.
cloud = WordCloud(
    width=800,
    height=400,
    random_state=21,
    max_font_size=110,
).generate_from_frequencies(dict(words_freq))

plt.figure(figsize=(20, 10))
plt.imshow(cloud, interpolation="bilinear")
plt.axis("off")
plt.show()
##modelling part
def remove_pattern(input_text, pattern):
    """Remove every substring of *input_text* that matches *pattern*.

    Parameters
    ----------
    input_text : str
        Text to clean.
    pattern : str
        Regular-expression pattern; all matches are deleted.

    Returns
    -------
    str
        The input text with all matches removed.
    """
    # The original looped over re.findall() and re-substituted each
    # *match* as if it were a pattern, which breaks whenever a match
    # contains regex metacharacters (e.g. '$' or '.').  A single re.sub
    # over the original pattern is correct and faster.
    return re.sub(pattern, '', input_text)
# Build a cleaned copy of each review in 'tidy_review':
#   1. strip @handles, 2. keep only letters and '#', 3. drop short words.
# A vectorized .apply replaces the original index loop, which relied on
# chained assignment (data['tidy_review'][i] = ...) — a pandas
# SettingWithCopy hazard that can silently fail to write.
data['tidy_review'] = data['verified_reviews'].apply(
    lambda text: remove_pattern(text, r"@[\w]*"))
data.head()
# regex=True is explicit: pandas >= 2.0 defaults str.replace to literal
# matching, which would break this character-class pattern.
data['tidy_review'] = data['tidy_review'].str.replace("[^a-zA-Z#]", " ", regex=True)
data.head(10)
# Keep only words longer than 3 characters.
data['tidy_review'] = data['tidy_review'].apply(
    lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
data.head()
# Tokenize, Porter-stem, and re-join each cleaned review.
tokenized_review = data['tidy_review'].apply(lambda x: x.split())
tokenized_review.head(5)

# Explicit import instead of the original wildcard `from ... import *`,
# which pollutes the module namespace.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
tokenized_review = tokenized_review.apply(lambda x: [stemmer.stem(i) for i in x])
tokenized_review.head(5)

# Vectorized join replaces the original positional-index loop, which
# would break if the Series index were not 0..n-1.
tokenized_review = tokenized_review.apply(' '.join)
data['tidy_review'] = tokenized_review
data.head()
##three methods of feature extraction from text is used
# Bag-of-words features (top 1000 terms) from the cleaned reviews;
# the feedback label is the classification target.
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import gensim

bow_vectorizer = CountVectorizer(
    max_df=0.9, min_df=2, max_features=1000, stop_words='english')
X_bow = bow_vectorizer.fit_transform(data['tidy_review'])
X_bow.shape
y_bow = data['feedback'].values
X_bow.shape, y_bow.shape
# 70/30 split on the bag-of-words features, then a baseline
# logistic-regression classifier on the feedback label.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

X_train, X_test, y_train, y_test = train_test_split(
    X_bow, y_bow, test_size=0.3, random_state=111)

log_model = LogisticRegression().fit(X_train, y_train)
y_pred = log_model.predict(X_test)
print(classification_report(y_test, y_pred))
Validate the model with cross-validation.
# Stratified 5-fold cross-validation of the baseline logistic model on
# the full bag-of-words data.
cv_lreg1_val = cross_val_score(log_model,X_bow,y_bow,cv=StratifiedKFold())
cv_lreg1_val
As we can see, the accuracy of the logistic regression model across all folds is in the same range, so I assume that this model is free from overfitting and underfitting and is valid.
Creating a new dataset by merging the sparse and dense datasets.
# The class probabilities predicted by the logistic model become two new
# features alongside the product variation.
log_text = log_model.predict_proba(X_bow)
# .copy() avoids pandas' SettingWithCopy hazard: the original assigned
# new columns to a slice of `data`, which can silently fail or mutate
# `data` itself.
log_data = data[['variation', 'feedback']].copy()
log_data['text_proba1'] = log_text[:, 0]
log_data['text_proba2'] = log_text[:, 1]

X_log = log_data[['variation', 'text_proba1', 'text_proba2']]
y_log = log_data['feedback']
# One-hot encode the categorical 'variation' column.
X_log = pd.get_dummies(X_log)
X_log.head()

# Kernel PCA projection of the dense feature matrix (all components kept).
kpca = KernelPCA(n_components=None)
X_log_pca = kpca.fit_transform(X_log)
pd.DataFrame(X_log_pca).head()
# Split the stacked (probability + variation) features into train and
# test sets and fit the final logistic model.
X_train_log,X_test_log,y_train_log,y_test_log =train_test_split(X_log_pca,y_log,test_size =0.3,random_state=123)
log_model_final = LogisticRegression()
log_model_final.fit(X_train_log,y_train_log)
log_final_pred = log_model_final.predict(X_test_log)
print(accuracy_score(y_test_log,log_final_pred))
# Validate with stratified 5-fold cross-validation.
k_fold = StratifiedKFold()
cv_log_final = cross_val_score(log_model_final,X_log_pca,y_log,cv=k_fold)
print(cv_log_final)
As we can see, all the validation splits perform approximately the same, so our model is valid.
Parameter tuning for the logistic model.
# Tune the regularization strength C of the logistic model by grid search.
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'C': np.logspace(-4, 4, 20)}
]
log_clf = GridSearchCV(LogisticRegression(), param_grid, cv=5)
log_clf.fit(X_log_pca, y_log)
log_clf.best_params_

# Refit on the training split using the exact best C found above; the
# original hard-coded a rounded copy (1.62377), which drifts out of sync
# if the search result changes.
log_model2_final = LogisticRegression(**log_clf.best_params_)
log_model2_final.fit(X_train_log, y_train_log)
log_final2_pred = log_model2_final.predict(X_test_log)
# Bug fix: the original printed the report for the UNtuned model's
# predictions (log_final_pred) instead of the tuned model's.
print(classification_report(y_test_log, log_final2_pred))
So the final accuracy of the model is 97.0%.
# Gradient-boosted trees on the same bag-of-words train/test split.
from xgboost import XGBClassifier
xgb = XGBClassifier(max_depth=6, n_estimators=1000, nthread= 3).fit(X_train,y_train)
xgb_pred = xgb.predict(X_test)
print(classification_report(y_test,xgb_pred))
# Stratified 5-fold cross-validation on the full bag-of-words data.
cv_xgb_val = cross_val_score(xgb,X_bow,y_bow,cv=StratifiedKFold())
cv_xgb_val
# As with the logistic model: XGBoost class probabilities on the full
# bag-of-words matrix become features next to the product variation.
xgb_text = xgb.predict_proba(X_bow)
# .copy() avoids pandas' SettingWithCopy hazard when adding columns below.
xgb_data = data[['variation', 'feedback']].copy()
xgb_data['text_proba1'] = xgb_text[:, 0]
xgb_data['text_proba2'] = xgb_text[:, 1]
X_xgb = xgb_data[['variation', 'text_proba1', 'text_proba2']]
y_xgb = xgb_data['feedback']
# One-hot encode the categorical 'variation' column.
X_xgb = pd.get_dummies(X_xgb)
kpca = KernelPCA(n_components=None)
X_xgb_pca = kpca.fit_transform(X_xgb)
X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = train_test_split(
    X_xgb_pca, y_xgb, test_size=0.3, random_state=123)
xgb_model_final = XGBClassifier(max_depth=6, n_estimators=1000, nthread=3)
xgb_model_final.fit(X_train_xgb, y_train_xgb)
xgb_final_pred = xgb_model_final.predict(X_test_xgb)
print(accuracy_score(y_test_xgb, xgb_final_pred))
SVM classifier
Getting a fair accuracy of 97.03%.
# Cross-validate the final XGBoost model on the stacked features.
k_fold = StratifiedKFold()
cv_xgb_final = cross_val_score(xgb_model_final,X_xgb_pca,y_xgb,cv=k_fold)
print(cv_xgb_final)
from sklearn.svm import SVC

# Baseline SVM on the bag-of-words split; probability=True enables
# predict_proba for the stacked-features step that follows.
svc = SVC(probability=True).fit(X_train, y_train)
svc_pred = svc.predict(X_test)
print(classification_report(y_test, svc_pred))

# Stratified 5-fold cross-validation of the SVM.
k_fold = StratifiedKFold()
cv_svc = cross_val_score(svc, X_bow, y_bow, cv=k_fold)
print(cv_svc)
# SVM class probabilities become features next to the product variation,
# mirroring the logistic and XGBoost pipelines above.
svc_text = svc.predict_proba(X_bow)
# .copy() avoids pandas' SettingWithCopy hazard when adding columns below.
svc_data = data[['variation', 'feedback']].copy()
svc_data['text_proba1'] = svc_text[:, 0]
svc_data['text_proba2'] = svc_text[:, 1]
X_svc = svc_data[['variation', 'text_proba1', 'text_proba2']]
y_svc = svc_data['feedback']
# One-hot encode the categorical 'variation' column.
X_svc = pd.get_dummies(X_svc)
kpca = KernelPCA(n_components=None)
X_svc_pca = kpca.fit_transform(X_svc)
X_train_svc, X_test_svc, y_train_svc, y_test_svc = train_test_split(
    X_svc_pca, y_svc, test_size=0.3, random_state=123)
# Final SVM on the stacked features (polynomial kernel, as original).
svc_model_final = SVC(kernel='poly')
svc_model_final.fit(X_train_svc, y_train_svc)
svc_final_pred = svc_model_final.predict(X_test_svc)
print(accuracy_score(y_test_svc, svc_final_pred))
# Cross-validate the final SVM.
k_fold = StratifiedKFold()
cv_svc_final = cross_val_score(svc_model_final, X_svc_pca, y_svc, cv=k_fold)
print(cv_svc_final)
### Comparison of the three models' held-out accuracies.
model = ['logistic', 'xgboost', 'svm']
accuracy = [97, 97.03, 96.40]  # % accuracy on the 30% test split, from above
plt.bar(x=model, height=accuracy, width=0.3, color=['red', 'blue', 'green'])
plt.xlabel("Model name")
plt.ylabel("Accuracy %")
# Bug fix: user-facing title was misspelled ("Comparision").
plt.title("Comparison of models")
plt.show()